#' 
#' reading in:
#'   manifesto_party_year.csv  (written out by 2-manifesto-hawkdove.R)
#'   whogov_clean.csv  (written out by 3-whogov-cleaning.R)
#'   WhoGov_crosssectional_V2.0.csv  (original data from WhoGov)
#'   partyfacts-external-parties.csv  (original data from PartyFacts)
#'   NELDA.xls  (original data from NELDA)
#'   (using peacesciencer package to merge in polity and vdem variables)
#' merging:
#'   merging partyfacts_id into m and w
#'   merging m vars into w, matchig on partyfacts_id
#'   collapsing from officer-year level to country-year level, creating wcy
#'   merging polity/vdem values into wcy
#'   merging election data from nelda into wcy
#' writing out:
#'   country_coverage.tex table
#'   data_for_analysis.csv
#' 


m = read_csv('process_data/manifesto_party_year.csv')
w = read_csv("process_data/whogov_clean.csv")

# whogov cross-sectional, just for the n_party variable
wcs = read_csv('source_data/WhoGov_crosssectional_V2.0.csv')


##### POLITY & VDEM ####

# creating country-year data with vdem_polyarchy and polity2 scores
# for analysis, can restrict sample to polity2 >= 6 (or ~equivalent vdem cutoff)

# creating dataset from the peacesciencer package
polity_dat = create_stateyears() |> add_democracy() |> 
  mutate(country_isocode = countrycode(ccode, origin='cown',destination='iso3c')) 

# polity2 >=6 is the standard cutoff for democracy
# want to find a corresponding value for polity2

polity_vdem_mod = lm(v2x_polyarchy ~ polity2, data=polity_dat)
(vdem_polity6_equiv = polity_vdem_mod$coef[1] + polity_vdem_mod$coef[2]*6)
# 0.55

# coding a binary democracy measure:
#  polity >=6, and polyarchy >= 0.55 (or only one if the other is a missing value)
polity_dat = polity_dat |>
  rowwise() |>
  mutate(
    dem = case_when(
      is.na(polity2) & is.na(v2x_polyarchy) ~ NA_real_,
      T ~ min(polity2 >= 6 , v2x_polyarchy >=vdem_polity6_equiv, na.rm=T)
    )
  )

# fill dem value forward in time (polity ends in 2017, vdem ends in 2019)
polity_dat_filled = polity_dat |>
  group_by(country_isocode) |>
  tidyr::fill(dem, .direction = 'down') |>
  ungroup() |>
  filter(year >= 1963)



##### MERGING WHOGOV AND MANIFESTO ######


# partyfacts links the three sets of party identifiers we're using:
#  partyfacts_id, whogov_partyid, and manifesto_partyid
partyfacts = read_csv('source_data/partyfacts-external-parties.csv',
                      guess_max=50000)  |> filter(! is.na(partyfacts_id))

# pfacts_m: linking manifesto_partyid with partyfacts_id
pfacts_m = partyfacts |> 
  filter(dataset_key == "manifesto") |>
  rename(manifesto_partyid = dataset_party_id) |>
  mutate(manifesto_partyid = manifesto_partyid |> as.numeric())

# pfacts_w: linking whogov_partyid with partyfacts_id
pfacts_w = partyfacts |> 
  filter(dataset_key == "whogov")|>
  rename(whogov_partyid = dataset_party_id)


# adding partyfacts_id into m, matching by manifesto_partyid
m = m |> left_join(pfacts_m |> 
                     select(manifesto_partyid,partyfacts_id)
) 


# adding in partyfacts_id into w, matching by whogov_partyid
w = w |> left_join(
  pfacts_w |> select(whogov_partyid, partyfacts_id)
)

# drop the duplicated party-years, i.e. when there are multiple elections in a year (keep the earlier one)
m = m |> 
  mutate(pfid_yr = paste(partyfacts_id, year, sep='_')) |>
  filter(!duplicated(pfid_yr))


# MANUAL CORRECTIONS ON partyfacts_id in w

w = w |> mutate(
  partyfacts_id = case_when(
    
    # Germany: CDU, 1375; CSU
    #   only appear in manifesto as "Christian Democratic Union/Christian Social Union", 211
    partyfacts_id %in% c(1375, 1731) ~ 211,
    
    # Belgium: Christian Democrats and Flemish, 
    partyfacts_id == 604 ~ 622,
    
    # France: 
    #   Union for the New Republic, 5514, and Union of Democrats for the Republic, 8041
    #  "Union for a New Majority - Gaullists/Conservatives" in manifesto, party=31621, partyfacts_id = 1083
    partyfacts_id %in% c(5514, 8041) ~ 1083,
    
    # Israel: 
    #  "Alignment | Israeli Labor Party | One Israel", 1436, changing to "Alignment", 615
    partyfacts_id == 1436 & year < 1988 ~ 615,
    
    # Switzerland:
    #  "FDP.The Liberals" has partyfacts_id = 1231 in pfacts_w, but partyfacts_id = 360 in pfacts_m
    #  (in manifesto, appears as "Radical Democratic Party")
    partyfacts_id == 1231 ~ 360,
    
    
    # Romania:
    #  National Salvation Front (partyfacts_id=958) is party=93221, which links to partyfacts_id = 1715 in pfacts_m
    partyfacts_id == 958 ~ 1715,
    
    T ~ partyfacts_id
  )
)


#### MERGE MANIFESTO HAWK VARS INTO W ####

m |> select(country_isocode, year) |> n_distinct() # 2902
w |> select(country_isocode, year) |> n_distinct() # 3009

# checking each country's coverage in each dataset
m_num_yr_by_iso = m |> group_by(country_isocode) |> summarise(n_m = n_distinct(year)) |> ungroup()
w_num_yr_by_iso = w |> group_by(country_isocode) |> summarise(n_w = n_distinct(year)) |> ungroup()

m_num_yr_by_iso |> left_join(
  w_num_yr_by_iso
) |> print(n=Inf)

# (2902 being close to 3009 is purely coincidental)

# merge everything from m into w
w = w |> left_join(
  m, by = c('partyfacts_id','year', 'country_isocode')
)


##### AGGREGATING TO COUNTRY-YEAR ######

# record a bunch of country-year level vars re: leader, mindef, and mfa
wcy = w |> 
  group_by(country_isocode, year) |>
  summarise(
    
    leader_name = name[leader==1],
    # is the national leader the prime minister
    leader_pm = as.numeric(str_detect(position[leader==1], 'Prime Min')), 
    
    leader_hawk = above_median_hawk[leader==1],
    leader_dove = below_median_hawk[leader==1],
    leader_party = partyfacts_id[leader==1],
    leader_indep = as.numeric(party_english[leader==1]=='independent'),
    
    mindef_hawk = above_median_hawk[m_defense==1],
    mindef_dove = below_median_hawk[m_defense==1],
    mindef_party = partyfacts_id[m_defense==1],
    mindef_indep = as.numeric(party_english[m_defense==1]=='independent'),
    
    mfa_hawk = above_median_hawk[m_foreignaffairs==1],
    mfa_dove = below_median_hawk[m_foreignaffairs==1],
    mfa_party = partyfacts_id[m_foreignaffairs==1],
    mfa_indep = as.numeric(party_english[m_foreignaffairs==1]=='independent')
    
  ) |> ungroup()


# hawkdove and sameparty variables for each office
wcy = wcy |> mutate(
  leader_hawkdove = case_when(
    leader_hawk==1 ~ 'H',
    leader_dove==1 ~ 'D',
    leader_indep==1 ~ 'I',
    leader_hawk==0 & leader_dove==0 ~ 'C'
  ),
  mindef_hawkdove = case_when(
    mindef_hawk==1 ~ 'H',
    mindef_dove==1 ~ 'D',
    mindef_indep==1 ~ 'I',
    mindef_hawk==0 & mindef_dove==0 ~ 'C'
  ),
  mfa_hawkdove = case_when(
    mfa_hawk==1 ~ 'H',
    mfa_dove==1 ~ 'D',
    mfa_indep==1 ~ 'I',
    mfa_hawk==0 & mfa_dove==0 ~ 'C'
  ),
  
  mindef_sameparty = as.numeric(leader_party==mindef_party & leader_indep==0),
  mfa_sameparty = as.numeric(leader_party==mfa_party & leader_indep==0),
  
)



# merging in the n_party variable from wcs,
#  and creating maj_parl variable
wcy = wcy |> left_join(
  wcs |> select(year, country_isocode, n_party)
) |>
  mutate(
    # is this a majority government in a parliamentary system
    maj_parl = leader_pm*(n_party < 2),
  )


# merge in the democracy dummy from the polity/vdem data
wcy = wcy |> left_join(
  polity_dat_filled |> select(country_isocode, year, dem)
) 

# which obs didn't get a dem value
wcy |> filter(is.na(dem)) |> pull(country_isocode) |> table()
# CZE DDR DEU SRB 
# 27  24  24  16 

# fixing these
wcy = wcy |>
  mutate(
    dem = case_when(
      country_isocode=='CZE' & year < 1990 ~ 0,
      country_isocode=='CZE' & year >= 1990 ~ 1,
      country_isocode=='DEU' ~ 1,
      country_isocode=='DDR' ~ 0,
      country_isocode=='SRB' & year > 2000 ~ 1,
      T ~ dem
      
    )
  )



##### MERGE IN NELDA #####

n_full = read_xls('source_data/NELDA 6.0/NELDA.xls')

#' interested only in elections in which incumbent is eligible for reelection
#'   nelda8: Did the incumbent reach his or her term limit?
#'   nelda20: Was the office of the incumbent leader contested in this election?
n = n_full %>% subset(nelda8=='no' & nelda20=='yes') # 1881 out of 4158

# dropping some countries that don't appear in whogov/manifesto
n = n |> filter(!country %in% c('Abkhazia','South Ossetia', 'Republic of Vietnam',
                                'Kosovo','South Yemen'))

n = n |> mutate(
  country_isocode = case_when(
    stateid %in% c('CZE', 'CZR') ~ 'CZE',
    T ~ countrycode(country, origin='country.name',destination='iso3c')
  )
)


# adding month variable
n = n |> mutate(
  month = case_when(
    nchar(mmdd)==4 ~ substr(mmdd,1,2),
    nchar(mmdd)==3 ~ substr(mmdd,1,1)
  )
)


# initialize variables in wcy for: upcoming elec in 1 or 2 years

wcy$elec1 = 0; wcy$elec2 = 0 

w_iso = w |> pull(country_isocode) |> unique() |> sort()

# iterate through elections
for(i in 1:nrow(n)){
  if(i%%100==1){cat(i,' ')}
  
  # store country name, and month/year
  cn = n$country_isocode[i]
  if(!cn %in% w_iso){next}
  
  mo = n$month[i] |> as.numeric()
  y = n$year[i] |> as.numeric()
  
  if(y < 1963){next}
  
  # whogov measured in july of each year (except jan in 1970)
  #  want to know if election is upcoming later in the year
  # so, match this election to the previous year in wcy
  if(mo<8 & y != 1970){y=y-1}
  
  # row in wcy corresponding to the year 
  wrow1 = which(wcy$country_isocode==cn & wcy$year==y)
  if(length(wrow1)!=1){next}
  wcy$elec1[wrow1]=1
  
  wrow2 = which(wcy$country_isocode==cn & wcy$year==y-1)
  if(length(wrow2)!=1){wrow2=NULL}
  wcy$elec2[c(wrow1,wrow2)]=1
  
  
}

table(wcy$elec1) # 570 x 1, 2358 x 0 
table(wcy$elec2) # 1082 x 1, 1846 x 0 












#### SUMMARY STATS #####

table(is.na(w$above_median_hawk)) # 44,756 not NA
table(w$party_english=='independent') # 10,631 


w |> group_by(leader, m_defense, m_foreignaffairs) |> 
  summarise(n_hawkdove = sum(!is.na(above_median_hawk)),
            n_indep = sum(party=='independent', na.rm=T))

#      leader m_defense m_foreignaffairs n_hawkdove n_indep
#       <dbl>     <dbl>            <dbl>      <int>   <int>
#   1      0         0                0      39158    9273
#   2      0         0                1       1806     522
#   3      0         1                0       1745     543
#   4      0         1                1         57       0
#   5      1         0                0       1990     293


wcy |> select(leader_hawkdove) |> table()
#   C   D   H   I 
# 743 482 727 292 
wcy |> filter(leader_hawkdove %in% c('C','D','H')) |> nrow() # 1952


wcy |> filter(!is.na(mindef_hawkdove)) |> select(leader_hawkdove) |> table()
#   C   D   H   I 
# 718 453 712 265 
wcy |> filter(!is.na(mindef_hawkdove) & leader_hawkdove %in% c('C','D','H')) |> nrow() # 1883

wcy |> filter(!is.na(mfa_hawkdove)) |> select(leader_hawkdove) |> table()
#   C   D   H   I 
# 723 464 720 275 
wcy |> filter(!is.na(mfa_hawkdove) & leader_hawkdove %in% c('C','D','H')) |> nrow() # 1907

# sample composition for main text analysis
wcy |> 
  filter(maj_parl==0 & 
           leader_hawkdove %in% c('C','D','H') &
           !is.na(mindef_hawkdove)) |>
  group_by(country_isocode) |>
  summarise(
    n=n() |> as.integer(), 
    first_yr = min(year) |> as.integer(), 
    last_year = max(year) |> as.integer()
  ) |>
  rename(
    country = country_isocode
  ) |>
  mutate(
    country = country |> countrycode(origin='iso3c',destination='country.name')
  ) |>
  arrange(country) |>
  xtable() |> print(include.rownames=F, file='tables/country_coverage.tex')


##### WRITING OUT #####

wcy |> write_csv('data_for_analysis.csv')

rm(list=ls()[ls()!='replication_wd'])


